import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
#!pip install plotly_express==0.4.0
# Read the Instagram data CSV into a DataFrame; the file is decoded as
# ISO-8859-1 rather than the pandas default.
csv_path = "C:/Users/Zehra Salmani/Desktop/extras/python/Instagram data.csv"
df = pd.read_csv(csv_path, encoding="ISO-8859-1")
# Preview the first rows of the loaded data.
df.head()
| | Impressions | From Home | From Hashtags | From Explore | From Other | Saves | Comments | Shares | Likes | Profile Visits | Follows | Caption | Hashtags |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3920 | 2586 | 1028 | 619 | 56 | 98 | 9 | 5 | 162 | 35 | 2 | Here are some of the most important data visua... | #finance #money #business #investing #investme... |
| 1 | 5394 | 2727 | 1838 | 1174 | 78 | 194 | 7 | 14 | 224 | 48 | 10 | Here are some of the best data science project... | #healthcare #health #covid #data #datascience ... |
| 2 | 4021 | 2085 | 1188 | 0 | 533 | 41 | 11 | 1 | 131 | 62 | 12 | Learn how to train a machine learning model an... | #data #datascience #dataanalysis #dataanalytic... |
| 3 | 4528 | 2700 | 621 | 932 | 73 | 172 | 10 | 7 | 213 | 23 | 8 | Heres how you can write a Python program to d... | #python #pythonprogramming #pythonprojects #py... |
| 4 | 2518 | 1704 | 255 | 279 | 37 | 96 | 5 | 4 | 123 | 8 | 0 | Plotting annotations while visualizing your da... | #datavisualization #datascience #data #dataana... |
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape
(119, 13)
# Data type of every column (numeric metrics vs. text columns).
df.dtypes
Impressions int64 From Home int64 From Hashtags int64 From Explore int64 From Other int64 Saves int64 Comments int64 Shares int64 Likes int64 Profile Visits int64 Follows int64 Caption object Hashtags object dtype: object
# Number of missing values in each column.
df.isnull().sum()
Impressions 0 From Home 0 From Hashtags 0 From Explore 0 From Other 0 Saves 0 Comments 0 Shares 0 Likes 0 Profile Visits 0 Follows 0 Caption 0 Hashtags 0 dtype: int64
# Distribution of impressions for each traffic source, one figure per source.
# sns.distplot is deprecated in seaborn (the warning is hidden by the
# warnings filter above), so sns.histplot is used with a KDE overlay on a
# density scale, which is the documented replacement for distplot's default
# histogram + KDE output.
for source in ("Home", "Hashtags", "Explore", "Other"):
    plt.figure(figsize=(8, 6))
    plt.title(f"Impressions from {source}")
    sns.histplot(df[f"From {source}"], kde=True, stat="density", kde_kws={"cut": 3})
    plt.show()
# Total impressions contributed by each traffic source across all posts.
home = df['From Home'].sum()
hashtags = df['From Hashtags'].sum()
explore = df['From Explore'].sum()
others = df['From Other'].sum()

labels = ['From Home', 'From Hashtags', 'From Explore', 'From Other']
values = [home, hashtags, explore, others]

# Donut chart of the four totals. `labels` and `values` are standalone
# 4-element lists, not columns of `df`, so no data_frame is passed: the
# per-post frame has a different length and none of its columns are used.
fig = px.pie(names=labels, values=values, title='Impressions from Various Sources', hole=0.5)
fig.show()
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
#!pip install wordcloud
#for all captions
# Join every caption into one space-separated string for the word cloud.
text = ' '.join(df.Caption)
# len(text) is the number of characters; split on whitespace so the figure
# printed really is a word count, as the message says.
print("There are {} words in combination of all captions ".format(len(text.split())))
There are 23021 words in combination of all captions
# Words to leave out of the cloud: wordcloud's built-in stop-word list.
stopwords = set(STOPWORDS)

# Configure the caption cloud (RGBA mode with no background colour), then
# build it from the joined caption text.
caption_cloud_options = {
    "mode": "RGBA",
    "width": 800,
    "height": 500,
    "max_font_size": 100,
    "stopwords": stopwords,
    "max_words": 80,
    "background_color": None,
}
caption_cloud = WordCloud(**caption_cloud_options)
wordcloud = caption_cloud.generate(text)

# Render the cloud as an image with the axes hidden.
plt.imshow(wordcloud, interpolation='quadric')
plt.axis("off")
plt.show()
#for all hashtags
# Join every post's hashtag string into one space-separated string.
text1 = ' '.join(df.Hashtags)
# Report on the hashtag text (text1), not the caption text, and count
# whitespace-separated words rather than characters.
print("There are {} words in combination of all hashtags ".format(len(text1.split())))
There are 23021 words in combination of all hashtags
# Words to leave out of the cloud: wordcloud's built-in stop-word list.
stopwords = set(STOPWORDS)

# Configure the hashtag cloud (smaller canvas, at most 50 words), then build
# it from the joined hashtag text.
hashtag_cloud_options = {
    "mode": "RGBA",
    "width": 500,
    "height": 300,
    "max_font_size": 100,
    "stopwords": stopwords,
    "max_words": 50,
    "background_color": None,
}
hashtag_cloud = WordCloud(**hashtag_cloud_options)
wordcloud = hashtag_cloud.generate(text1)

# Render the cloud as an image with the axes hidden.
plt.imshow(wordcloud, interpolation='quadric')
plt.axis("off")
plt.show()
# One scatter plot per engagement metric against Impressions. Marker size
# follows the metric itself and an OLS trendline is drawn on each figure.
for engagement_metric in ('Likes', 'Comments', 'Shares', 'Saves', 'Profile Visits'):
    fig = px.scatter(
        data_frame=df,
        x='Impressions',
        y=engagement_metric,
        size=engagement_metric,
        trendline='ols',
        title='Relationship between Impressions and ' + engagement_metric,
    )
    fig.show()
# Pairwise correlation of the numeric columns. Caption and Hashtags are text
# columns; recent pandas versions no longer drop them silently and raise
# instead, so restrict the frame to numeric columns explicitly.
df.select_dtypes(include='number').corr()
| | Impressions | From Home | From Hashtags | From Explore | From Other | Saves | Comments | Shares | Likes | Profile Visits | Follows |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Impressions | 1.000000 | 0.844698 | 0.560760 | 0.893607 | 0.592960 | 0.779231 | -0.028524 | 0.634675 | 0.849835 | 0.760981 | 0.889363 |
| From Home | 0.844698 | 1.000000 | 0.177516 | 0.800573 | 0.555666 | 0.768817 | 0.012716 | 0.674985 | 0.698330 | 0.531076 | 0.672675 |
| From Hashtags | 0.560760 | 0.177516 | 1.000000 | 0.190453 | 0.229623 | 0.305929 | 0.161439 | 0.219511 | 0.662124 | 0.691345 | 0.555485 |
| From Explore | 0.893607 | 0.800573 | 0.190453 | 1.000000 | 0.495685 | 0.747803 | -0.158565 | 0.615731 | 0.653699 | 0.531850 | 0.796019 |
| From Other | 0.592960 | 0.555666 | 0.229623 | 0.495685 | 1.000000 | 0.331907 | -0.108703 | 0.156834 | 0.393510 | 0.633080 | 0.546737 |
| Saves | 0.779231 | 0.768817 | 0.305929 | 0.747803 | 0.331907 | 1.000000 | -0.026912 | 0.860324 | 0.845643 | 0.360628 | 0.628461 |
| Comments | -0.028524 | 0.012716 | 0.161439 | -0.158565 | -0.108703 | -0.026912 | 1.000000 | 0.016933 | 0.123586 | 0.096714 | -0.060631 |
| Shares | 0.634675 | 0.674985 | 0.219511 | 0.615731 | 0.156834 | 0.860324 | 0.016933 | 1.000000 | 0.707794 | 0.245361 | 0.493070 |
| Likes | 0.849835 | 0.698330 | 0.662124 | 0.653699 | 0.393510 | 0.845643 | 0.123586 | 0.707794 | 1.000000 | 0.626107 | 0.746333 |
| Profile Visits | 0.760981 | 0.531076 | 0.691345 | 0.531850 | 0.633080 | 0.360628 | 0.096714 | 0.245361 | 0.626107 | 1.000000 | 0.853152 |
| Follows | 0.889363 | 0.672675 | 0.555485 | 0.796019 | 0.546737 | 0.628461 | -0.060631 | 0.493070 | 0.746333 | 0.853152 | 1.000000 |
# Correlation matrix over the numeric columns only; the text columns
# (Caption, Hashtags) would make corr() raise on recent pandas versions.
a = df.select_dtypes(include='number').corr()
# Rank every numeric column by its correlation with Impressions, strongest first.
a['Impressions'].sort_values(ascending = False)
Impressions 1.000000 From Explore 0.893607 Follows 0.889363 Likes 0.849835 From Home 0.844698 Saves 0.779231 Profile Visits 0.760981 Shares 0.634675 From Other 0.592960 From Hashtags 0.560760 Comments -0.028524 Name: Impressions, dtype: float64
# Share of profile visits that turned into follows, as a percentage
# (total follows divided by total profile visits over all posts).
total_follows = df['Follows'].sum()
total_profile_visits = df['Profile Visits'].sum()
conversation_rate = (total_follows / total_profile_visits) * 100
print(conversation_rate)
41.00265604249668
# Scatter of Follows against Profile Visits with an OLS trendline; marker
# size follows the number of profile visits.
fig = px.scatter(
    data_frame=df,
    x='Profile Visits',
    y='Follows',
    size='Profile Visits',
    trendline='ols',
    title='Relationship between Profile Visits and Follows ',
)
fig.show()
# Column labels of the dataset, used to pick the model features below.
df.columns
Index(['Impressions', 'From Home', 'From Hashtags', 'From Explore',
'From Other', 'Saves', 'Comments', 'Shares', 'Likes', 'Profile Visits',
'Follows', 'Caption', 'Hashtags'],
dtype='object')
# Model inputs: the six engagement metrics, in this fixed column order.
feature_columns = ['Saves', 'Comments', 'Shares', 'Likes', 'Profile Visits', 'Follows']
x = np.array(df[feature_columns])
# Model target: the number of impressions of each post.
y = np.array(df['Impressions'])
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveRegressor
# Hold out 20% of the posts for evaluation; the split is seeded so it is
# the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size = 0.2)
# Seed the regressor as well: it shuffles the training data while fitting,
# so without a random_state the fitted model and its score change from run
# to run even though the split is fixed.
model = PassiveAggressiveRegressor(random_state=42)
model.fit(x_train,y_train)
PassiveAggressiveRegressor()
# Predicted impressions for the held-out test posts.
model.predict(x_test)
array([ 6230.79478639, 9569.85722763, 3811.04600369, 7668.73069153,
5797.44215179, 29597.41308771, 3969.06578757, 2977.03593739,
13877.74412764, 4991.34990147, 6871.64146821, 9898.47750504,
4843.31138824, 3601.15659496, 5408.08030377, 4666.33705043,
4008.83479828, 5463.62461192, 5059.12675504, 6043.55445996,
11514.06016197, 21789.62933606, 5797.44215179, 7293.3771316 ])
# Score of the fitted model on the held-out test set (for scikit-learn
# regressors, `score` reports the coefficient of determination, R^2).
model.score(x_test,y_test)
0.8883558388190361
# Predict impressions for one hypothetical post. The values must follow the
# training column order:
# Saves, Comments, Shares, Likes, Profile Visits, Follows
sample_post = [233.0, 4.0, 9.0, 282.0, 165.0, 54.0]
features = np.array([sample_post])
model.predict(features)
array([13533.46854936])